import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn import metrics
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd

from preprocess import conv_dict, conv_data


# Load the labelled training data and the unlabelled data to be scored.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# The 'ID' column is removed from both frames before any conversion.
train = train.drop(['ID'], axis=1)
test = test.drop(['ID'], axis=1)

# The conversion dictionary is built from the training frame only and then
# applied to both frames.
# NOTE(review): presumably a categorical encoding -- confirm in
# preprocess.conv_dict / preprocess.conv_data.
train_dict = conv_dict(train)
train = conv_data(train, train_dict)
test = conv_data(test, train_dict)

# Rows whose Credit_Product is missing are set aside for a separate AUC check.
null_index = train['Credit_Product'].isnull()
test_null = train.loc[null_index]
print(test_null.info())
y_null = np.array(test_null['Is_Lead'])
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally.
x_null = test_null.drop('Is_Lead', axis=1)
y = np.array(train['Is_Lead'])
x = np.array(train.drop('Is_Lead', axis=1))

# Row positions are split together with x / y so that the rows that ended up
# in the training split can be identified afterwards.  Adding a third array
# does not change which rows go where for a fixed random_state.
row_positions = np.arange(len(train))
x_train, x_test, y_train, y_test, train_rows, _ = train_test_split(
    x, y, row_positions, test_size=0.2, random_state=18)

# Oversample ONLY rows that belong to the training split.  Taking them from
# the whole frame would copy hold-out rows into the training data and make
# the early-stopping signal and every reported metric optimistic.
# Rows containing any missing value are dropped before SMOTE is applied.
train_os = train.iloc[train_rows].dropna()
y_os = np.array(train_os['Is_Lead'])
x_os = np.array(train_os.drop('Is_Lead', axis=1))

over_sampler = SMOTE(random_state=0)
x_os, y_os = over_sampler.fit_resample(x_os, y_os)

# The resampled rows are appended to (not substituted for) the training split.
x_train = np.concatenate((x_train, x_os))
y_train = np.concatenate((y_train, y_os))
print(x_train.shape, y_train.shape)
# Booster configuration handed to xgb.train.
#
# 'n_estimators' is intentionally absent: it is an argument of the
# scikit-learn wrapper, whereas xgb.train takes the number of boosting rounds
# from its own num_boost_round argument, so the key was never used.
#
# NOTE(review): 'eta' and 'learning_rate' are aliases for the same setting but
# carry different values here (0.01 vs 0.1).  Which one takes effect depends on
# the library's alias resolution -- decide on the intended step size and keep
# only one of the two keys.
params = {
    'booster': 'gbtree',
    'eval_metric': 'auc',
    'objective': 'binary:logistic',
    'eta': 0.01,
    'learning_rate': 0.1,
    'max_depth': 5,
    'min_child_weight': 6,
    'seed': 0,
    'subsample': 0.85,
    'colsample_bytree': 0.8,
    'gamma': 0.8,
    'reg_alpha': 0.4,
    'reg_lambda': 0.7,
}

# Wrap the training and hold-out splits for the native XGBoost API.  Both are
# put on the watch list; the hold-out entry comes last.
xgb_train = xgb.DMatrix(x_train, label=y_train)
xgb_test = xgb.DMatrix(x_test, label=y_test)
watchlist = [(xgb_train, 'train'), (xgb_test, 'test')]

# Train for at most `num_round` rounds, stopping after 200 rounds without
# improvement and logging every 50 rounds, then persist the booster.
num_round = 5000
bst = xgb.train(
    params,
    xgb_train,
    num_boost_round=num_round,
    evals=watchlist,
    early_stopping_rounds=200,
    verbose_eval=50,
)
bst.save_model('test.model')

# Hold-out scores.
# NOTE(review): predict() is called without restricting it to the best
# iteration found by early stopping -- confirm that this is intended for the
# installed xgboost version.
pred = bst.predict(xgb_test)

# Scores at or above 0.5 are labelled 1, everything else 0.
y_pred = (pred >= 0.5).astype(int)

# AUC is computed from the raw scores; the other metrics use the hard labels.
print('AUC: %.4f' % metrics.roc_auc_score(y_test, pred))
label_reports = (
    ('ACC: %.4f', metrics.accuracy_score),
    ('Recall: %.4f', metrics.recall_score),
    ('F1-score: %.4f', metrics.f1_score),
    ('Precesion: %.4f', metrics.precision_score),
)
for template, scorer in label_reports:
    print(template % scorer(y_test, y_pred))

# Score the unlabelled frame.
test_arr = xgb.DMatrix(test)
preds_test = bst.predict(test_arr)

# AUC restricted to the rows whose Credit_Product was missing.
# NOTE(review): x_null is taken from the full training frame, so it also
# contains rows the model was fitted on; this number is not a pure hold-out
# estimate.
pred_null = bst.predict(xgb.DMatrix(x_null))
print('NULL-AUC: %.4f' % metrics.roc_auc_score(y_null, pred_null))

# One score per line, eight decimals.
np.savetxt('./xgboost_ans.csv', preds_test, fmt='%.8f', delimiter=',')


